The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
library(zoo)
Attaching package: 'zoo'
The following objects are masked from 'package:base':
as.Date, as.Date.numeric
## data extracted from New York Times state-level data from NYT Github repository# https://github.com/nytimes/covid-19-data## state-level population information from us_census_data available on GitHub repository:# https://github.com/COVID19Tracking/associated-data/tree/master/us_census_data### FINISH THE CODE HERE #### load COVID state-level data from NYTcv_states <-as.data.frame(read.csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))### FINISH THE CODE HERE #### load state population datastate_pops <-as.data.frame(read.csv("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))state_pops$abb <- state_pops$statestate_pops$state <- state_pops$state_namestate_pops$state_name <-NULL
'data.frame': 61942 obs. of 5 variables:
$ date : chr "2020-01-21" "2020-01-22" "2020-01-23" "2020-01-24" ...
$ state : chr "Washington" "Washington" "Washington" "Illinois" ...
$ fips : int 53 53 53 17 53 6 17 53 4 6 ...
$ cases : int 1 1 1 1 1 1 1 1 1 2 ...
$ deaths: int 0 0 0 0 0 0 0 0 0 0 ...
### FINISH THE CODE HEREcv_states <-merge(# Datax = cv_states, y = state_pops,# List of variables to matchby.x ="state",by.y ="state", # Which obs to keep?all.x =TRUE, all.y =TRUE )
Step 2: Look at the data
The data appears to be in correct format.
dim(cv_states)
[1] 61942 9
head(cv_states)
state date fips cases deaths geo_id population pop_density abb
1 Alabama 2023-01-04 1 1587224 21263 1 4887871 96.50939 AL
2 Alabama 2020-04-25 1 6213 213 1 4887871 96.50939 AL
3 Alabama 2023-02-26 1 1638348 21400 1 4887871 96.50939 AL
4 Alabama 2022-12-03 1 1549285 21129 1 4887871 96.50939 AL
5 Alabama 2020-05-06 1 8691 343 1 4887871 96.50939 AL
6 Alabama 2021-04-21 1 524367 10807 1 4887871 96.50939 AL
# format the datecv_states$date <-as.Date(cv_states$date, format="%Y-%m-%d")# format the state and state abbreviation (abb) variablesstate_list <-unique(cv_states$state)cv_states$state <-factor(cv_states$state, levels = state_list)abb_list <-unique(cv_states$abb)cv_states$abb <-factor(cv_states$abb, levels = abb_list)### FINISH THE CODE HERE # order the data first by state, second by datecv_states = cv_states[order(cv_states$state, cv_states$date),]
# Confirm the variables are now correctly formattedtail(cv_states)
Based on observations of the dataset structure, I can confirm that the variables are correctly formatted.
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?head(cv_states)
state date fips cases deaths geo_id population pop_density abb
1029 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
597 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
282 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
12 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
266 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
78 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
summary(cv_states)
state date fips cases
Washington : 1158 Min. :2020-01-21 Min. : 1.00 Min. : 1
Illinois : 1155 1st Qu.:2020-12-09 1st Qu.:17.00 1st Qu.: 70784
California : 1154 Median :2021-09-16 Median :31.00 Median : 351496
Arizona : 1153 Mean :2021-09-13 Mean :32.25 Mean : 889830
Massachusetts: 1147 3rd Qu.:2022-06-20 3rd Qu.:46.00 3rd Qu.: 1043231
Wisconsin : 1143 Max. :2023-03-23 Max. :78.00 Max. :12169158
(Other) :55032
deaths geo_id population pop_density
Min. : 0 Min. : 1.00 Min. : 577737 Min. : 1.292
1st Qu.: 1171 1st Qu.:16.00 1st Qu.: 1805832 1st Qu.: 43.659
Median : 5035 Median :29.00 Median : 4468402 Median : 107.860
Mean : 11779 Mean :29.78 Mean : 6397965 Mean : 423.031
3rd Qu.: 14912 3rd Qu.:44.00 3rd Qu.: 7535591 3rd Qu.: 229.511
Max. :104277 Max. :72.00 Max. :39557045 Max. :11490.120
NA's :3848 NA's :3848 NA's :4954
abb
WA : 1158
IL : 1155
CA : 1154
AZ : 1153
MA : 1147
(Other):52327
NA's : 3848
min(cv_states$date)
[1] "2020-01-21"
max(cv_states$date)
[1] "2023-03-23"
The range of variables are listed in the following codes:
diff(range(cv_states$date))
Time difference of 1157 days
diff(range(cv_states$cases))
[1] 12169157
diff(range(cv_states$deaths))
[1] 104277
Step 4: Add new_cases and new_deaths and correct outliers
# Add variables for new_cases and new_deaths:for (i in1:length(state_list)) { cv_subset =subset(cv_states, state == state_list[i]) cv_subset = cv_subset[order(cv_subset$date),]}# add starting level for new cases and deaths cv_subset$new_cases = cv_subset$cases[1] cv_subset$new_deaths = cv_subset$deaths[1]### FINISH THE CODE HEREfor (j in2:nrow(cv_subset)) { cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$new_cases[j-1] cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$new_deaths[j-1] }# include in main dataset cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths# Focus on recent datescv_states <- cv_states %>% dplyr::filter(date >="2021-06-01")
### FINISH THE CODE HERE# Inspect outliers in new_cases using plotlyp1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) +geom_line() +geom_point(size = .5, alpha =0.5)ggplotly(p1)
p2<-NULL# to clear from workspace# set negative new case or death counts to 0cv_states$new_cases[cv_states$new_cases<0] =0cv_states$new_deaths[cv_states$new_deaths<0] =0# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`for (i in1:length(state_list)) { cv_subset =subset(cv_states, state == state_list[i])# add starting level for new cases and deaths cv_subset$cases = cv_subset$cases[1] cv_subset$deaths = cv_subset$deaths[1]}
### FINISH CODE HEREfor (j in2:nrow(cv_subset)) { cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$cases[j] cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$deaths[j]# include in main dataset cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths}# Smooth new countscv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>%round(digits =0)cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>%round(digits =0)# Inspect data again interactivelyp2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +geom_line() +geom_point(size = .5, alpha =0.5)ggplotly(p2)
#p2=NULL
Step 5: Add additional variables
### FINISH CODE HERE# add population normalized (by 100,000) counts for each variablecv_states$per100k =as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
# add a naive_CFR variable = deaths / casescv_states = cv_states %>%mutate(naive_CFR =round((deaths*100/cases),2))# create a `cv_states_today` variablecv_states_today =subset(cv_states, date==max(cv_states$date))
Section 2: Scatterplots
Step 6: Explore scatterplots using plot_ly()
### FINISH CODE HERE# pop_density vs. casescv_states_today %>%plot_ly(x =~pop_density, y =~cases, type ='scatter', mode ='markers', color =~state,size =~population, sizes =c(5, 70), marker =list(sizemode='diameter', opacity=0.5))
Warning: Ignoring 5 observations
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# filter out "District of Columbia"cv_states_today_filter <- cv_states_today %>%filter(state!="District of Columbia")# pop_density vs. cases after filteringcv_states_today_filter %>%plot_ly(x =~pop_density, y =~cases, type ='scatter', mode ='markers', color =~state,size =~population, sizes =c(5, 70), marker =list(sizemode='diameter', opacity=0.5))
Warning: Ignoring 5 observations
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# pop_density vs. deathsper100kcv_states_today_filter %>%plot_ly(x =~pop_density, y =~deathsper100k,size =~population, sizes =c(5, 70), marker =list(sizemode='diameter', opacity=0.5))
No trace type specified:
Based on info supplied, a 'scatter' trace seems appropriate.
Read more about this trace type -> https://plotly.com/r/reference/#scatter
No scatter mode specifed:
Setting the mode to markers
Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Warning: Ignoring 5 observations
Warning: `line.width` does not currently support multiple values.
# Adding hoverinfocv_states_today_filter %>%plot_ly(x =~pop_density, y =~deathsper100k,type ='scatter', mode ='markers', color =~state,size =~population, sizes =c(5, 70), marker =list(sizemode='diameter', opacity=0.5),hoverinfo ='text',text =~paste( paste(state, ":", sep=""), paste(" Cases per 100k: ", per100k, sep="") , paste(" Deaths per 100k: ", per100k, sep=""), sep ="<br>")) %>%layout(title ="Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",yaxis =list(title ="Deaths per 100k"), xaxis =list(title ="Population Density"),hovermode ="compare")
Warning: Ignoring 5 observations
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Step 7: Explore scatterplot trend interactively using ggplotly() and geom_smooth()
There seems to be a positive relationship with population density and deaths (per 100k)
Step 8: Multiple line chart
### FINISH CODE HERE# Line chart for naive_CFR for all states over time using `plot_ly()`plot_ly(cv_states, x =~date, y =~naive_CFR, color =~state, type ="scatter", mode ="lines")
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
After approximately Jan 2022, most states seem to level out in naive_CFR
### FINISH CODE HERE# Line chart for Florida showing new_cases and new_deaths togethercv_states %>%filter(state=="Florida") %>%plot_ly(x =~date, y =~cases, type ="scatter", mode ="lines") %>%add_trace(x =~date, y =~new_deaths, type ="scatter", mode ="lines")
** used chatgpt to find the add_trace function
Step 9: Heatmaps
### FINISH CODE HERE# Map state, date, and new_cases to a matrixlibrary(tidyr)cv_states_mat <- cv_states %>%select(state, date, new_cases) %>% dplyr::filter(date>as.Date("2021-06-15"))cv_states_mat2 <-as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = date))
Warning: Values from `date` are not uniquely identified; output will contain list-cols.
• Use `values_fn = list` to suppress this warning.
• Use `values_fn = {summary_fun}` to summarise duplicates.
• Use the following dplyr code to identify duplicates.
{data} %>%
dplyr::group_by(new_cases, state) %>%
dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
dplyr::filter(n > 1L)
rownames(cv_states_mat2) <- cv_states_mat2$datecv_states_mat2$date <-NULLcv_states_mat2 <-as.matrix(cv_states_mat2)# Create a heatmap using plot_ly()plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),z=~cv_states_mat2,type="heatmap",showscale=T)
Warning: Values from `date` are not uniquely identified; output will contain list-cols.
• Use `values_fn = list` to suppress this warning.
• Use `values_fn = {summary_fun}` to summarise duplicates.
• Use the following dplyr code to identify duplicates.
{data} %>%
dplyr::group_by(newper100k, state) %>%
dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
dplyr::filter(n > 1L)
# Create a second heatmap after filtering to only include dates every other weekfilter_dates <-seq(as.Date("2021-06-15"), as.Date("2021-11-01"), by="weeks")cv_states_mat <- cv_states %>%select(state, date, newper100k) %>%filter(date %in% filter_dates)cv_states_mat2 <-as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = date))
Warning: Values from `date` are not uniquely identified; output will contain list-cols.
• Use `values_fn = list` to suppress this warning.
• Use `values_fn = {summary_fun}` to summarise duplicates.
• Use the following dplyr code to identify duplicates.
{data} %>%
dplyr::group_by(newper100k, state) %>%
dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
dplyr::filter(n > 1L)
rownames(cv_states_mat2) <- cv_states_mat2$datecv_states_mat2$date <-NULLcv_states_mat2 <-as.matrix(cv_states_mat2)# Create a heatmap using plot_ly()plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),z=~cv_states_mat2,type="heatmap",showscale=T)
I was stuck on this code so could not figure out which states stand out the most.
Step 10: Map
### For specified datepick.date ="2021-10-15"# Extract the data for each state by its abbreviationcv_per100 <- cv_states %>%filter(date==pick.date) %>%select(state, abb, newper100k, cases, deaths) # select datacv_per100$state_name <- cv_per100$statecv_per100$state <- cv_per100$abbcv_per100$abb <-NULL# Create hover textcv_per100$hover <-with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))# Set up mapping detailsset_map_details <-list(scope ='usa',projection =list(type ='albers usa'),showlakes =TRUE,lakecolor =toRGB('white'))# Make sure both maps are on the same color scaleshadeLimit <-125# Create the mapfig <-plot_geo(cv_per100, locationmode ='USA-states') %>%add_trace(z =~newper100k, text =~hover, locations =~state,color =~newper100k, colors ='Purples' )fig <- fig %>%colorbar(title =paste0("Cases per 100k: ", pick.date), limits =c(0,shadeLimit))
Warning: Ignoring 55 observations
fig <- fig %>%layout(title =paste('Cases per 100k by State as of ', pick.date, '<br>(Hover for value)'),geo = set_map_details )fig_pick.date <- fig################ Map for today's date# Extract the data for each state by its abbreviationcv_per100 <- cv_states_today %>%select(state, abb, newper100k, cases, deaths) # select datacv_per100$state_name <- cv_per100$statecv_per100$state <- cv_per100$abbcv_per100$abb <-NULL# Create hover textcv_per100$hover <-with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))# Set up mapping detailsset_map_details <-list(scope ='usa',projection =list(type ='albers usa'),showlakes =TRUE,lakecolor =toRGB('white'))# Create the mapfig <-plot_geo(cv_per100, locationmode ='USA-states') %>%add_trace(z =~newper100k, text =~hover, locations =~state,color =~newper100k, colors ='Purples' )fig <- fig %>%colorbar(title =paste0("Cases per 100k: ", Sys.Date()), limits =c(0,shadeLimit))
Warning: Ignoring 55 observations
fig <- fig %>%layout(title =paste('Cases per 100k by State as of', Sys.Date(), '<br>(Hover for value)'),geo = set_map_details )fig_Today <- fig### Plot together subplot(fig_pick.date, fig_Today, nrows =2, margin = .05)
I did not do this code correctly, so I cannot compare the two maps.